library(survival)
library(stargazer)
library(multiwayvcov)
library(dplyr)
library(readxl)
library(lmtest)
library(ggpattern)
library(survMisc)
library(ggfortify)
library(vtable)
library(survminer)
library(stringr)
library(vtable)

#set working directory to correct folder
#setwd("/Users/Tanu/Library/CloudStorage/Box-Box/Smart Cities article/Replication Materials/GTREP") 
setwd("~/Box Sync/Index Blueprint2/Smart Cities Article/Replication Materials/GTREP") 
options(digits=3)


#FIGURES-----

##Figure 1----


gtfsx <- read_excel("gtfsx.xlsx") #GTFS updates over time with covariates for adopters as of 2018

gtfs <- read_excel("gtfs.xlsx") #GTFS updates over time for all adopters 

gtfs19  <- read_excel("gtfs19.xlsx")

#Create Revenue Terciles for figure 1

gtfsx$rev_cat <- ntile(gtfsx$revenue13, 3)
gtfsx$rev_cat[gtfsx$rev_cat == 1] <- "Small"
gtfsx$rev_cat[gtfsx$rev_cat == 2] <- "Medium"
gtfsx$rev_cat[gtfsx$rev_cat== 3] <- "Large"
rev_cat <- gtfsx %>% select(rev_cat,NTDID)
gtfs <- merge(rev_cat , gtfs, by.x="NTDID", by.="NTDID", all.x=FALSE, all.y=TRUE)

plotme <- unique(gtfs %>% select(NTDID, AgencyName, year, rev_cat, adopt))
plotme <- na.omit(plotme) 
#NAs are agencies that don't report revenue in 2013

#number of agencies for figure
length(unique(gtfsx$NTDID[!(is.na(gtfsx$revenue13))]))

plotme <-plotme %>% group_by(rev_cat, year) %>% summarize(sum=sum(adopt))

plotme$year <- as.character(plotme$year)
plotme$pct <- ifelse(plotme$rev_cat == "Medium", plotme$sum/length(unique(gtfsx$NTDID[gtfsx$rev_cat == "Medium"])), plotme$sum)
plotme$pct <- ifelse(plotme$rev_cat == "Large", plotme$sum/length(unique(gtfsx$NTDID[gtfsx$rev_cat == "Large"])), plotme$pct)
plotme$pct <- ifelse(plotme$rev_cat == "Small", plotme$sum/length(unique(gtfsx$NTDID[gtfsx$rev_cat == "Small"])), plotme$pct)

plotme$pct=plotme$pct*100


plotme$`Revenue Category` <- plotme$rev_cat
one <- ggplot(data = plotme, mapping = aes(x=year, y=pct, fill=`Revenue Category`, pattern=`Revenue Category` )) +
  theme(plot.title=element_text(hjust=0.5))+
  geom_bar_pattern(stat="identity", position = "dodge", 
                   color = "white", 
                   pattern_fill = "white",
                   pattern_spacing = 0.025,
                   pattern_key_scale_factor = 0.6)+ 
  guides(fill=guide_legend(title="Revenue Category"))+
  xlab("Year") + ylab("Cumulative Percentage Adopting")+
  theme_bw()+scale_pattern_manual(values = c('none', 'stripe', 'none'))+
  scale_fill_manual(values = c("black", "lightgrey","darkgrey"))#this one looks funky because so few "other" agencies


one +
  theme(plot.title=element_text(hjust=0.5))+
  guides(fill = guide_legend(override.aes = list(pattern = c('none', 'stripe', 'none'))))+
  labs(caption = "Number of Agencies = 132")

#Calculate thresholds for each tercile for footnotes
max(na.omit(gtfsx$revenue13[gtfsx$rev_cat == "Small"]))
max(na.omit(gtfsx$revenue13[gtfsx$rev_cat == "Medium"]))
max(na.omit(gtfsx$revenue13[gtfsx$rev_cat == "Large"]))

##Figure 3----
#Adoption and Revenue
#survival curve

three <- coxph(Surv(time, indicator)  ~ strata(rev_cat) + orgtype+PC13, data = gtfsx)
fig3 <- ggsurvplot(survfit(three), data = gtfsx, conf.int=TRUE,  palette = c("black", "grey38",  "grey70"), fun="event",
                   surv.scale = c("percent")) + xlab("Years past 2013") + ylab("1 - Survival Rate")

fig3

#calculate hazard rates at every point in time

# Fit the survival model
rthree<- coxph(Surv(time, indicator) ~ strata(rev_cat) + orgtype + PC13, data = gtfsx)

# Create the survfit object
surv_fit <- survfit(three)

#Extract Survival terms, manually pulling strata out of the regression
surv_data <- data.frame(
  time = surv_fit$time,
  n_risk = surv_fit$n.risk,
  n_event = surv_fit$n.event,
  survival = surv_fit$surv,
  strata = factor(rep(names(surv_fit$strata), surv_fit$strata)) # Handle strata explicitly
)

# Add 1 - Survival Rate
surv_data$`1-Survival` <- 1 - surv_data$survival

# View the survival data for all strata
print(head(surv_data))

# Filter for each level of rev_cat (adjust strata names if needed)
levels(surv_data$strata)  # Check strata names
surv_data_small <- subset(surv_data, strata == "Small")
surv_data_medium <- subset(surv_data, strata == "Medium")
surv_data_large <- subset(surv_data, strata == "Large")

# View the extracted data for each group
print(head(surv_data_small))
print(head(surv_data_medium))
print(head(surv_data_large))

#Calculate thresholds for each tercile for footnotes
max(na.omit(gtfsx$revenue13[gtfsx$rev_cat == "Small"]))
max(na.omit(gtfsx$revenue13[gtfsx$rev_cat == "Medium"]))
max(na.omit(gtfsx$revenue13[gtfsx$rev_cat == "Large"]))

#Review data frame outputs to identify 1-survival rate at 1 and 3 years

##Figure 4----
#Maintenance conditional on Income and Age Group

plotme = unique(gtfs19 %>% select(NTDID, age, routes, AgencyName, yrmo, updates, `routes/updates`))
plotme$Age <- ntile(plotme$age, 3)
plotme$Age[plotme$Age == 1] <- "1. Low"
plotme$Age[plotme$Age == 2] <- "2. Medium"
plotme$Age[plotme$Age== 3] <- "3. High"
plotme_age <-na.omit(plotme %>% group_by(yrmo, Age) %>% summarize(sum=sum(updates), n = length(unique(AgencyName)))) 

#Quarterly time
plotme_age$Time <- format(as.yearqtr(plotme_age$yrmo))
#sometimes N changes from 25 to 26, so we'll take the mean N in for each quarter
plotme_age <- unique(plotme_age %>% group_by(Time, Age) %>% summarize(average = sum(sum)/mean(n)))
#make year legible again
plotme_age$datenorm  <- as.Date(as.yearqtr(plotme_age$Time))
plotme_age$year <- substr(plotme_age$datenorm, 1, 4)

#calculate N 
length(unique(plotme$NTDID))

#age
four <-ggplot(data = na.omit(plotme_age), mapping = aes(x=datenorm, y=average, fill=Age)) + #y is sum plus one 
  #because otherwise the log scale gets wonky
  geom_bar(stat="identity", position = "dodge")+ guides(fill=guide_legend(title="Age Group"))+
  xlab("Year") + ylab("Average # of Quarterly Updates")     +
 theme_bw()+scale_fill_manual(values = c("black", "darkgrey","lightgrey"))
four +  theme(plot.title=element_text(hjust=0.5))+
  guides(fill = guide_legend(override.aes = list(pattern = c('none', 'stripe', 'none'))))+
  labs(caption = "Number of Agencies = 77")

#terciles for endnote 

max(na.omit(plotme$age[plotme$Age == "1. Low"]))
max(na.omit(plotme$age[plotme$Age == "2. Medium"]))
max(na.omit(plotme$age[plotme$Age == "3. High"]))

#ANALYSIS-----


##Table A1----
#Descriptive Statistics, Agency Level Characteristics

surv_tabl <- gtfsx %>% select(indicator,gtfsrt, revenue13, mile13, pop13, orgtype, PC13)
#add labels
labs <- data.frame(indicator = 'GTFS-s Adoption',
                   gtfsrt = 'GTFS-r Adoption',
                   orgtype = 'Organization Type',
                   PC13 = 'Principal City Indicator (2013)',
                   mile13 = 'Service Area Sq. Miles (2013)',
                   pop13 = 'Population (2013)',
                   revenue13 ="Revenue (2013)"
                   )
st(surv_tabl, labels=labs, file = "~/Desktop/TableA1.html", summ = c('length(x)', 'mean(x)', 'min(x)','max(x)'))


##Table A2----

acstbl <- gtfs19 %>% select(updates, income, age, pc_nonwhite, pc_employed,pc_internet, orgtype, PC18,pop, mile, revenue)
#add labels
labs <- data.frame(updates = 'GTFS Updates (Monthly)',
                   income = 'Median Household Income',
                   age = 'Median Age',
                   pc_nonwhite = 'Percent Non-white Population',
                   pc_employed = 'Percent Unemployed',
                   pc_internet = 'Percent Population with Internet Subscription',
                   orgtype = 'Organization Type',
                   PC18 = 'Principal City Indicator (2018)',
                   mile = 'Service Area Sq. Miles (2018)',
                   pop = 'Population (2018)',
                   revenue = 'Revenue (2018)'
)
st(acstbl, labels=labs, file = "~/Desktop/TableA2.html", summ = c('length(x)', 'mean(x)', 'min(x)','max(x)') )


##Table A3----

rtpd <- read_excel("rtpd.xlsx")
rtpdtbl <- rtpd %>% select(OrgType, pop21, mile21, revenue21)
#add labels
labs <- data.frame(OrgType = 'Organization Type',
                   mile21 ='Service Area Sq. Miles (2021)',
                   pop21 = 'Population (2021)',
                   revenue21 = 'Revenue (2021)'
                   
)
st(rtpdtbl, labels=labs, file = "~/Desktop/TableA3.html", summ = c('length(x)', 'mean(x)', 'min(x)','max(x)') )


##Table A4----
gtfsx$orgtype <- relevel(as.factor(gtfsx$orgtype), ref = "Local Government")

#2013
mod.gtfs1 = coxph(Surv(time, indicator) ~
                    orgtype + PC13 + revlog13,
                  data=gtfsx)
mod.gtfs2 = coxph(Surv(time, indicator) ~
                    orgtype + PC13 + milelog13,  
                  data=gtfsx)
mod.gtfs3 = coxph(Surv(time, indicator) ~
                    orgtype +PC13+ poplog13 ,
                  data=gtfsx)
mod.gtfs4 =  coxph(Surv(time, indicator) ~
                    orgtype+ PC13+    poplog13 + milelog13 + revlog13,
                  data=gtfsx)

stargazer(mod.gtfs1, mod.gtfs2, mod.gtfs3, mod.gtfs4, dep.var.labels=c("Time to GTFS Adoption (2013 - 2021)<sup>1</sup>"),digits = 3, notes.append = FALSE, omit.stat = c("ser"),df=FALSE,
          covariate.labels = c("Org. Type: Independent Public Agency<sup>2</sup>", "Org Type: Other" ,"Principal City<sup>3</sup><em>" ,"Revenue (log)", "Service Area (log)", "Service Population (log)" ), notes.align = "l", 
          notes = c("<sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01. All models estimated using Cox Proportional Hazards.", "<sup>1</sup>Number of years prior to GTFS adoption for each agency. Standard errors are corrected for heteroskedasticity (HC2).", "<sup>2</sup><em>Organization Type Reference Category</em>: Local Government", "<sup>3</sup><em>Reference Category</em>: Non-Principal City"),
          out = "~/Desktop/GTFSA4.html")


##Table A5----
#Agency Characteristics (IV) and Maintenance (DV)

fe1 <- lm(updates ~orgtype+PC18+log(revenue) +factor(yrmo) , data = gtfs19)
vcov1=cluster.vcov(fe1,cluster=gtfs19$NTDID) 
fe2 <- lm(updates ~ orgtype+ PC18+milelog +factor(yrmo) , data = gtfs19)
vcov2=cluster.vcov(fe2,cluster=gtfs19$NTDID) 
fe3 <- lm(updates ~  orgtype + PC18+ poplog+factor(yrmo), data = gtfs19)
vcov3=cluster.vcov(fe3,cluster=gtfs19$NTDID) 
fe4 <- lm(updates ~orgtype+PC18+ poplog+milelog+log(revenue) +factor(yrmo) , data = gtfs19)
vcov4=cluster.vcov(fe4,cluster=gtfs19$NTDID) 

stargazer(fe1, fe2, fe3, fe4,  omit = c("yrmo", "NTDID"), dep.var.labels=c("GTFS Maintenance (2019 - 2021)<sup>1</sup>"),  
          covariate.labels = c("Org. Type: Independent Public Agency<sup>2</sup>", "Org Type: Other" ,"Principal City<sup>3</sup>", "Revenue (log)","Service Area (log)","Service Population (log)"),
          notes.align = "l", notes.append =  FALSE, df= FALSE,omit.stat = c("ser"),
          notes = c("<sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01. All models estimated using month-level fixed effects, with standard errors clustered at the agency level.", "<sup>1</sup> Number of GTFS updates per agency-month.", "<sup>2</sup><em> Organization Type Reference Category</em>: Local Government", "<sup>3</sup><em>Reference Category</em>: Non-Principal City"),
          se=list(sqrt(diag(vcov1)),sqrt(diag(vcov2)),sqrt(diag(vcov3)), sqrt(diag(vcov4))),out = "~/Desktop/GTFSA5.html")

##Table A6----
#Demographics (IV) and Maintenance (DV)

fe1 <- lm(updates ~pc_nonwhite +factor(yrmo), data = gtfs19)
vcov1=cluster.vcov(fe1,cluster=gtfs19$NTDID) 
fe2 <- lm(updates ~  logHHI +factor(yrmo), data = gtfs19)
vcov2=cluster.vcov(fe2,cluster=gtfs19$NTDID) 
fe3 <- lm(updates ~ +age +factor(yrmo), data = gtfs19)
vcov3=cluster.vcov(fe3,cluster=gtfs19$NTDID) 
fe4 <- lm(updates ~ pc_nonwhite+logHHI +age +pc_internet+pc_employed+factor(yrmo), data = gtfs19)
vcov4=cluster.vcov(fe4,cluster=gtfs19$NTDID) 
stargazer(fe1, fe2, fe3, fe4, omit = c("yrmo"),dep.var.labels=c("GTFS Maintenance (2019 - 2021)<sup>1</sup>"), se=list(sqrt(diag(vcov1)), sqrt(diag(vcov2)),sqrt(diag(vcov3)), sqrt(diag(vcov4))),
          covariate.labels = c("Percent Non-white Pop." ,"Median HHI", "Median Age", "Internet Access", "Unemployment Rate"), 
          notes.align = "l",notes.append = FALSE, df= FALSE,
          notes = c("<sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01. All models estimated using month-level fixed effects, with standard errors clustered at the agency level.", "<sup>1</sup> Number of GTFS updates per agency-month."), out = "~/Desktop/GTFSA6.html")

##Table SI.1-----
riders <- read_excel("riders.xlsx")
riders <- within(riders, orgtype <- relevel(as.factor(orgtype), ref = "Local Government"))
riders$pct_change=ifelse(riders$pct_change == "Inf", NA, riders$pct_change)
riders$pct_change=ifelse(riders$pct_change == 100, NA, riders$pct_change)
riders$pct_change=ifelse(riders$pct_change == -100, NA, riders$pct_change)
riders$abspct_change=ifelse(riders$abspct_change == "Inf", NA, riders$abspct_change)
riders$abspct_change=ifelse(riders$abspct_change == "Inf", NA, riders$abspct_change)
riders$logpass=ifelse(riders$logpass == "Inf", NA, riders$logpass)
riders$logpass=ifelse(riders$logpass == "-Inf", NA, riders$logpass)

ride_tab <- riders %>% select(updates, lagUPT, pct_change, abspct_change)
#add labels
labs <- data.frame(
  updates="GTFS Updates",
  lagUPT='Ridership (# Unliked Passenger Trips)',
  pct_change='% Change Ridership',
  abspct_change='Absolute Value, % Change Ridership')
st(ride_tab, labels=labs, file = "~/Desktop/TableSI.1.html", summ = c('length(x)','mean(x)', 'min(x)','max(x)'))


##Table SI.2----
#Demographics (IV 1), Agency Characteristics (IV 2), and Maintenance (DV)

fe1 <- lm(updates ~orgtype + PC18+log(revenue)  + pc_nonwhite+logHHI +age +pc_employed+pc_internet+factor(yrmo), data = gtfs19)
vcov1=cluster.vcov(fe1,cluster=gtfs19$NTDID) 
fe2 <- lm(updates  ~orgtype + PC18 +poplog + pc_nonwhite+logHHI +age +pc_employed+pc_internet+factor(yrmo), data = gtfs19)
vcov2=cluster.vcov(fe2,cluster=gtfs19$NTDID) 
fe3 <- lm(updates ~  orgtype + PC18 +milelog + pc_nonwhite+logHHI +age +pc_employed+pc_internet+factor(yrmo), data = gtfs19)
vcov3=cluster.vcov(fe3,cluster=gtfs19$NTDID) 
fe4 <- lm(updates ~  orgtype + PC18+log(revenue)+poplog +milelog + pc_nonwhite+logHHI +age +pc_employed+pc_internet+factor(yrmo), data = gtfs19)
vcov4=cluster.vcov(fe4,cluster=gtfs19$NTDID) 
stargazer(fe1, fe2, fe3, fe4, omit = c("yrmo"), se=list(sqrt(diag(vcov1)),
                                                        sqrt(diag(vcov2)),sqrt(diag(vcov3)), sqrt(diag(vcov4))), df= FALSE,omit.stat = c("ser"),
          dep.var.labels=c("GTFS Maintenance (2019 - 2021)<sup>1</sup>"), covariate.labels = c("Org. Type: Independent Public Agency<sup>2</sup>", "Org Type: Other" ,"Principal City<sup>3</sup>",  "Revenue (log)", "Service Population (log)", "Service Area (log)","Percent Non-white Pop.","Median HHI",
                                                                                               "Median Age","Internet Access", "Unemployment Rate"), notes.align = "l", notes.append = FALSE,
          notes = c("<sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01. All models estimated using month-level fixed effects, with standard errors clustered at the agency level.",
                    "<sup>1</sup> Number of GTFS updates per agency-month.", "<sup>2</sup><em> Organization Type Reference Category</em>: Local Government", "<sup>3</sup><em>Reference Category</em>: Non-Principal City"),no.space = TRUE, out = "~/Desktop/GTFSSI.2.html")


##Table SI.3----
#Maximum Demographics (IV) and Maintenance (DV)
gtfsmax <- read_excel("gtfsmax.xlsx")
gtfsmax <- subset(gtfsmax, gtfsmax$year < 2022)

fe1 <- lm(updates ~pc_nonwhite +factor(yrmo), data = gtfsmax)
vcov1=cluster.vcov(fe1,cluster=gtfsmax$NTDID) 
fe2 <- lm(updates ~  logHHI +factor(yrmo), data = gtfsmax)
vcov2=cluster.vcov(fe2,cluster=gtfsmax$NTDID) 
fe3 <- lm(updates ~   age +factor(yrmo), data = gtfsmax)
vcov3=cluster.vcov(fe3,cluster=gtfsmax$NTDID) 
fe4 <- lm(updates ~ pc_nonwhite+logHHI +age +pc_internet+ pc_employed+factor(yrmo), data = gtfsmax)
vcov4=cluster.vcov(fe4,cluster=gtfsmax$NTDID) 
stargazer(fe1, fe2, fe3, fe4, omit = c("yrmo"),dep.var.labels=c("GTFS Maintenance (2019 - 2021)<sup>1</sup>"), se=list(sqrt(diag(vcov1)), sqrt(diag(vcov2)),sqrt(diag(vcov3)), sqrt(diag(vcov4))),
          notes.align = "l",notes.append = FALSE,df= FALSE,omit.stat = c("ser"),
          notes = c("<sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01. All models estimated using month-level fixed effects, with standard errors clustered at the agency level.", "<sup>1</sup> Number of GTFS updates per agency-month."),
          covariate.labels = c("Percent Non-white Pop.","Median HHI", "Median Age","Internet Access", "Unemployment Rate"), out = "~/Desktop/GTFSSI.3.html") 


##Table SI.4----
#Minimum Demographics (IV) and Maintenance (DV)
gtfsmin <- read_excel("gtfsmin.xlsx")
gtfsmin <- subset(gtfsmin, gtfsmin$year < 2022)
fe1 <- lm(updates ~pc_nonwhite+factor(yrmo), data = gtfsmin)
vcov1=cluster.vcov(fe1,cluster=gtfsmin$NTDID) 
fe2 <- lm(updates ~ logHHI  +factor(yrmo), data = gtfsmin)
vcov2=cluster.vcov(fe2,cluster=gtfsmin$NTDID) 
fe3 <- lm(updates ~  age +factor(yrmo), data = gtfsmin)
vcov3=cluster.vcov(fe3,cluster=gtfsmin$NTDID) 
fe4 <- lm(updates ~ pc_nonwhite+logHHI +age +pc_internet+pc_employed+factor(yrmo), data = gtfsmin)
vcov4=cluster.vcov(fe4,cluster=gtfsmin$NTDID) 
stargazer(fe1, fe2, fe3, fe4, omit = c("yrmo"),dep.var.labels=c("GTFS Maintenance (2019 - 2021)<sup>1</sup>"), se=list(sqrt(diag(vcov1)), sqrt(diag(vcov2)),sqrt(diag(vcov3)), sqrt(diag(vcov4))),
          notes.align = "l",df= FALSE,omit.stat = c("ser"),
          notes = c("<sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01. All models estimated using month-level fixed effects, with standard errors are clustered at the agency level.", "<sup>1</sup> Number of GTFS updates per agency-month."),
          covariate.labels = c("Percent Non-white Pop.","Median HHI", "Median Age","Internet Access", "Unemployment Rate"), out = "~/Desktop/GTFSSI.4.html") 

##Table SI.5----


riders2=subset(riders, riders$NTDID %in%gtfs19$NTDID) # Subset to years in GTFS longitudinal analysis to evaluate revenue change during pandemic
riders2=subset(riders2, riders2$year > 2018)
fe1 <- lm(updates ~pct_change  +factor(NTDID), data=riders2)
vcov1=cluster.vcov(fe1,cluster=riders2$NTDID)
fe2 <- lm(updates ~abspct_change  +factor(NTDID), data = riders2)
vcov2=cluster.vcov(fe2,cluster=riders2$NTDID)
fe3 <- lm(updates ~logpass  +factor(NTDID), data = riders2)
vcov3=cluster.vcov(fe3,cluster=riders2$NTDID)
stargazer(fe1, fe2, fe3, omit = c("NTDID"), covariate.labels = c("Ridership: Percent Change", "Ridership: Abs. Value % Change", "Ridership: Log Value"),
          dep.var.labels=c("GTFS Maintenance (2019 - 2021)<sup>1</sup>"), se=list(sqrt(diag(vcov1)), sqrt(diag(vcov2)), sqrt(diag(vcov3))),
          notes.align = "l", df=FALSE, notes.append = FALSE,omit.stat = c("ser"),
          notes = c("<sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01. All models estimated using agency-level fixed effects, with standard errors clustered at the agency level.", "<sup>1</sup> Number of GTFS updates per agency-month."),
          out = "~/Desktop/GTFSSI.5.html")

##Table SI.6 ----
#GTFS Adotpion (IV) and Ridership (DV)

fe1 <- lm(abspct_change ~ lagadopt +factor(yrmo), data =riders)
vcov1=cluster.vcov(fe1,cluster=riders$NTDID) 
fe2 <- lm(pct_change ~ lagadopt+factor(yrmo), data = riders)
vcov2=cluster.vcov(fe2,cluster=riders$NTDID) 
fe3 <- lm(logpass ~  lagadopt+factor(yrmo), data = riders)
vcov3=cluster.vcov(fe3,cluster=riders$NTDID) 
stargazer(fe1, fe2, fe3, omit = c("yrmo", "NTDID"),dep.var.labels=c("Absolute Value % Change Ridership", "% Change Ridership", "Ridership: Log Value"), se=list(sqrt(diag(vcov1)), sqrt(diag(vcov2)),sqrt(diag(vcov3))),
          notes.align = "l",df= FALSE,omit.stat = c("ser"),
          notes = c("<sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01. All models estimated using month level fixed effects.", "<sup>1</sup> Number of GTFS updates per agency-month. Standard errors are clustered at the agency level."),
          covariate.labels = c("GTFS Adoption"), out = "~/Desktop/GTFSSI.6.html") 


##Table SI.7----
#Agency Characteristics (IV) and GTFS Realtime (DV)
a=lm(gtfsrt~orgtype+PC20+ revlog21, data=gtfsx) 
av=vcov(a, type="HC2")
b=lm(gtfsrt~orgtype+PC20+milelog21, data=gtfsx) 
bv=vcov(b, type="HC2")
c=lm(gtfsrt~orgtype+PC20+poplog21, data=gtfsx) 
cv=vcov(c, type="HC2")
d=lm(gtfsrt~orgtype+PC20 + revlog21+milelog21+poplog21, data=gtfsx) 
dv=vcov(d, type="HC2")
stargazer(a,b,c,d,
          se=list(sqrt(diag(av)),sqrt(diag(bv)),sqrt(diag(cv)),sqrt(diag(dv))),dep.var.labels=c("GTFS-RT Adoption (2022)<sup>1</sup>"),
          covariate.labels = c("Org. Type: Independent Public Agency<sup>2</sup>", "Org Type: Other" , "Principal City (2020)<sup>3</sup>" ,"Revenue (log) (2021)","Service Area (log) (2021)",  "Service Population (log) (2021)"),
          type='html',omit.stat = "f",no.space = TRUE, notes.align = "l", notes.append = FALSE,df= FALSE,
          notes = c("<sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01. All models estimated using Ordinary Least Squares regression.", "<sup>1</sup> Standard errors are corrected for heteroskedasticity (HC2).", "<sup>2</sup><em>Organization Type Reference Category</em>: Local Government", "<sup>3</sup><em> Reference Category</em>: Non-Principal City"),
          out = "~/Desktop/GTFSSI.7.html")




##Table SI.8----

#clean things up, alphabetize output for word table

#cities
#clean things up, alphabetize output for word table
principal_cities <- gtfsx$City[gtfsx$PC13 == "Principal City"]

cleaned_cities <- str_to_title(principal_cities)
# Remove duplicates and sort alphabetically
unique_sorted_cities <- sort(unique(cleaned_cities))
# Combine into a single string separated by commas
output_string <- paste(unique_sorted_cities, collapse = ", ")
# Print the result
cat(output_string)

#transit agencies
principal_cities <- gtfsx$AgencyName[gtfsx$PC13 == "Principal City"]
cleaned_cities <- str_to_title(principal_cities)
cleaned_cities <- str_replace_all(cleaned_cities, "\\bOf\\b", "of")
# Remove duplicates and sort alphabetically
unique_sorted_cities <- sort(unique(cleaned_cities))
# Combine into a single string separated by commas
output_string <- paste(unique_sorted_cities, collapse = ", ")
# Print the result
cat(output_string)

#for non-principal cities

principal_cities <- gtfsx$City[gtfsx$PC13 == "Non-Principal City"]
cleaned_cities <- str_to_title(principal_cities)
# Remove duplicates and sort alphabetically
unique_sorted_cities <- sort(unique(cleaned_cities))
# Combine into a single string separated by commas
output_string <- paste(unique_sorted_cities, collapse = ", ")
# Print the result
cat(output_string)

#transit agencies

principal_cities <- gtfsx$AgencyName[gtfsx$PC13 == "Non-Principal City"]
cleaned_cities <- str_to_title(principal_cities)
cleaned_cities <- str_replace_all(cleaned_cities, "\\bOf\\b", "of")
# Remove duplicates and sort alphabetically
unique_sorted_cities <- sort(unique(cleaned_cities))
# Combine into a single string separated by commas
output_string <- paste(unique_sorted_cities, collapse = ", ")
# Print the result
cat(output_string)


##Table SI.9----

#2016 Covariates

mod.gtfs1 = coxph(Surv(time, indicator) ~
                    orgtype + PC15 + revlog16,
                  data=gtfsx)
mod.gtfs2 = coxph(Surv(time, indicator) ~
                    orgtype + PC15 + milelog16,  
                  data=gtfsx)
mod.gtfs3 = coxph(Surv(time, indicator) ~
                    orgtype +PC15+ poplog16 ,
                  data=gtfsx)
mod.gtfs4 = coxph(Surv(time, indicator) ~
                    orgtype+ PC15+    poplog16 + milelog16 +revlog16,
                  data=gtfsx)

stargazer(mod.gtfs1, mod.gtfs2, mod.gtfs3, mod.gtfs4, dep.var.labels=c("Time to GTFS Adoption (2013 - 2021)<sup>1</sup>"),digits = 3, notes.append = FALSE, omit.stat = c("ser"),df=FALSE,
          covariate.labels = c("Org. Type: Independent Public Agency<sup>2</sup>", "Org Type: Other" ,"Principal City<sup>3</sup><em>" ,"Revenue (log)", "Service Area (log)", "Service Population (log)" ), notes.align = "l", 
          notes = c("<sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01. All models estimated using Cox Proportional Hazards.", "<sup>1</sup>Number of years prior to GTFS adoption for each agency. Standard errors are corrected for heteroskedasticity (HC2).", "<sup>2</sup><em>Organization Type Reference Category</em>: Local Government", "<sup>3</sup><em>Reference Category</em>: Non-Principal City"),
          out = "~/Desktop/GTFSSI.9.html")

